import pandas as pd
import os
import numpy as np
#import pandas_profiling
# Location of the raw demographics table, relative to the working directory.
path = 'DataSet/Original'
file = 'demographics.csv'
# Build the file path with os.path.join instead of manual '/' concatenation;
# the comma separator and double-quote handling are kept explicit.
demographics = pd.read_csv(os.path.join(path, file), sep=',', doublequote=True)
# Notebook-style display of the (rows, columns) tuple of the loaded table.
demographics.shape
(188, 33)
# Preview the first five rows of the loaded table.
demographics.head(n=5)
| | nta_name | borough | nta_code | population | under_5_years | 5-9_years | 10-14_years | 15-19_years | 20-24_years | 25-29_years | ... | 15000_to_24999 | 25000_to_34999 | 35000_to_49999 | 50000_to_74999 | 75000_to_99999 | 100000_to_149999 | 150000_to_199999 | 200000_or_more | median_income | mean_income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Allerton-Pelham Gardens | Bronx | BX31 | 28903 | 1679 | 1706 | 1763 | 2039 | 1964 | 1703 | ... | 797 | 773 | 1160 | 1764 | 1155 | 1562 | 765 | 427 | 61638 | 78489 |
| 1 | Annadale-Huguenot-Prince's Bay-Eltingville | Staten Island | SI01 | 27770 | 1397 | 1698 | 1817 | 1880 | 1720 | 1594 | ... | 571 | 405 | 1008 | 1523 | 1346 | 2075 | 1086 | 1151 | 88288 | 109187 |
| 2 | Arden Heights | Staten Island | SI48 | 25238 | 1507 | 1540 | 1596 | 1752 | 1614 | 1561 | ... | 337 | 516 | 707 | 1421 | 1611 | 2021 | 1047 | 740 | 89570 | 101627 |
| 3 | Astoria | Queens | QN70 | 78793 | 3480 | 3037 | 3060 | 3392 | 6630 | 11586 | ... | 3673 | 2816 | 4725 | 6463 | 4557 | 4698 | 1627 | 1197 | 54882 | 70094 |
| 4 | Auburndale | Queens | QN48 | 19996 | 917 | 966 | 1063 | 1168 | 1214 | 1307 | ... | 445 | 632 | 690 | 1417 | 1060 | 1237 | 589 | 433 | 70772 | 84402 |
5 rows × 33 columns
# Count how many rows belong to each distinct borough value.
demographics['borough'].value_counts()
Queens           56
Brooklyn         50
Bronx            36
Manhattan        28
Staten Island    18
Name: borough, dtype: int64
# Re-encode the borough column as a pandas categorical, then list the
# dtype of every column to confirm the conversion.
borough_column = demographics['borough']
demographics['borough'] = borough_column.astype('category')
demographics.dtypes
nta_name               object
borough              category
nta_code               object
population              int64
under_5_years           int64
5-9_years               int64
10-14_years             int64
15-19_years             int64
20-24_years             int64
25-29_years             int64
30-34_years             int64
35-39_years             int64
40-44_years             int64
45-49_years             int64
50-54_years             int64
55-59_years             int64
60-64_years             int64
over_65_years           int64
median_age              int64
people_per_acre       float64
households              int64
less_than_10,000        int64
10000_to_14999          int64
15000_to_24999          int64
25000_to_34999          int64
35000_to_49999          int64
50000_to_74999          int64
75000_to_99999          int64
100000_to_149999        int64
150000_to_199999        int64
200000_or_more          int64
median_income           int64
mean_income             int64
dtype: object
# NOTE(review): profile_report is not defined anywhere in this file; it is
# presumably provided by the pandas_profiling import that is commented out
# at the top of the file — confirm that import is active before running this.
demographics.profile_report()
# Summary statistics (count, mean, std, min, quartiles, max) per column.
demographics.describe()
| | population | under_5_years | 5-9_years | 10-14_years | 15-19_years | 20-24_years | 25-29_years | 30-34_years | 35-39_years | 40-44_years | ... | 15000_to_24999 | 25000_to_34999 | 35000_to_49999 | 50000_to_74999 | 75000_to_99999 | 100000_to_149999 | 150000_to_199999 | 200000_or_more | median_income | mean_income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 188.000000 | 188.00000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | ... | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 | 188.000000 |
| mean | 43397.175532 | 2752.12766 | 2515.478723 | 2489.159574 | 2842.632979 | 3404.617021 | 3872.106383 | 3512.468085 | 3116.265957 | 3007.898936 | ... | 1750.861702 | 1487.734043 | 1909.824468 | 2544.452128 | 1803.074468 | 2078.186170 | 939.824468 | 1224.664894 | 57002.489362 | 77808.877660 |
| std | 21288.062949 | 1695.80947 | 1496.808314 | 1459.440306 | 1574.371410 | 1886.482082 | 2462.683806 | 2113.358863 | 1704.685137 | 1531.520696 | ... | 1065.437156 | 893.651778 | 1076.642531 | 1415.221055 | 1041.592900 | 1430.129096 | 890.562915 | 2315.006940 | 24341.286390 | 38791.512605 |
| min | 13354.000000 | 506.00000 | 408.000000 | 326.000000 | 449.000000 | 798.000000 | 736.000000 | 742.000000 | 689.000000 | 743.000000 | ... | 191.000000 | 371.000000 | 496.000000 | 782.000000 | 241.000000 | 201.000000 | 20.000000 | 8.000000 | 20334.000000 | 29720.000000 |
| 25% | 27237.000000 | 1490.50000 | 1443.500000 | 1437.750000 | 1619.000000 | 1929.500000 | 2089.250000 | 1928.000000 | 1835.750000 | 1867.500000 | ... | 930.000000 | 827.750000 | 1139.500000 | 1504.750000 | 1111.000000 | 1220.250000 | 426.500000 | 248.750000 | 39015.000000 | 54656.750000 |
| 50% | 37897.000000 | 2386.00000 | 2228.500000 | 2197.500000 | 2568.000000 | 2956.000000 | 3081.500000 | 2893.000000 | 2555.500000 | 2520.000000 | ... | 1504.500000 | 1209.000000 | 1629.000000 | 2134.000000 | 1580.000000 | 1819.500000 | 760.000000 | 517.500000 | 53114.500000 | 70396.000000 |
| 75% | 54244.750000 | 3579.25000 | 3242.750000 | 3263.750000 | 3717.250000 | 4527.000000 | 5252.750000 | 4611.000000 | 4034.500000 | 3778.500000 | ... | 2235.250000 | 2066.500000 | 2494.250000 | 3117.500000 | 2328.500000 | 2462.500000 | 1068.750000 | 963.000000 | 71430.750000 | 89989.500000 |
| max | 132378.000000 | 14703.00000 | 11971.000000 | 10024.000000 | 9094.000000 | 10046.000000 | 11971.000000 | 11292.000000 | 10406.000000 | 10004.000000 | ... | 5231.000000 | 4549.000000 | 5901.000000 | 7655.000000 | 6109.000000 | 9962.000000 | 6359.000000 | 15031.000000 | 155213.000000 | 311109.000000 |
8 rows × 30 columns
# Count the missing values in every column.
demographics.isnull().sum()
nta_name             0
borough              0
nta_code             0
population           0
under_5_years        0
5-9_years            0
10-14_years          0
15-19_years          0
20-24_years          0
25-29_years          0
30-34_years          0
35-39_years          0
40-44_years          0
45-49_years          0
50-54_years          0
55-59_years          0
60-64_years          0
over_65_years        0
median_age           0
people_per_acre      0
households           0
less_than_10,000     0
10000_to_14999       0
15000_to_24999       0
25000_to_34999       0
35000_to_49999       0
50000_to_74999       0
75000_to_99999       0
100000_to_149999     0
150000_to_199999     0
200000_or_more       0
median_income        0
mean_income          0
dtype: int64